library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(UpSetR)
Gene names/synonyms required for databases cleaning
ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
"external_synonym", "gene_biotype",
"chromosome_name", "band", "start_position", "end_position",
"strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
ensembl_gene_synonym <- ensembl_gene_synonym %>%
mutate(external_synonym = sub(pattern = "ORF", external_synonym,
replacement = "orf"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
attributes_vector <- c("external_gene_name",
"external_synonym")
gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
load("../CTdata/eh_data/all_genes.rda")
load("../CTdata/eh_data/CT_genes.rda")
CT_and_CTP_genes <- CT_genes
CT_genes <- (filter(CT_genes, CT_gene_type == "CT_gene"))
upset_text_size <- c(1.5, 1.5, 1.5, 1.5, 1.8, 2)
#c(intersection size title, intersection size tick labels, set size title,
# set size tick labels, set names, numbers above bars)
CT lists from other databases have been checked (using GTEx and our
GTEx_expression() funtion and GeneCards) in order to remove
duplicated gene names or deprecated ones and allow comparison between
databases.
Online list copied in a csv file, several lists exist so we combined them.
We checked gene names that were a concatenation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.
CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";",
escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
"CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis,
by = c("Gene_Name" = "Gene_Symbol"))
CTdatabase_single <- CTdatabase %>%
mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))
CTdatabase_official_names <-
unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id,
external_gene_name)) %>%
filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA)
CTdatabase_synonym <-
ensembl_gene_synonym %>%
filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
mutate(Gene_Name = external_synonym) %>%
dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <-
rbind(CTdatabase_official_names, CTdatabase_synonym) %>%
left_join(CTdatabase_single)
duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"
CTdatabase_cleaned <- ensembl_gene_synonym %>%
mutate(Gene_Name = external_synonym) %>%
filter(external_synonym == "CXorf61") %>%
dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
filter(external_gene_name == "CCNA1") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "GOLGA6L2") %>%
filter(ensembl_gene_id == "ENSG00000174450") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "LYPD6B") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "CT62") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "CT75") %>%
filter(ensembl_gene_id == "ENSG00000291155") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "LINC01192") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "TSPY1") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "SSX2B") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- CTdatabase_cleaned[!duplicated(CTdatabase_cleaned$external_gene_name), ]
length(CTdatabase_cleaned$ensembl_gene_id[!CTdatabase_cleaned$ensembl_gene_id%in%all_genes$ensembl_gene_id])
## [1] 9
9 genes are not found in all_genes even when using gene names and ensembl_gene_id
Excel file coming from supplemental data.
Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1109", "Gene"] <- "BLTP1"
Jamin_core_CT[Jamin_core_CT$Gene == "C10orf82", "Gene"] <- "SPMIP5"
Jamin_core_CT[Jamin_core_CT$Gene == "TTC30B", "Gene"] <- "IFT70B"
Jamin_core_CT[Jamin_core_CT$Gene == "TTC30A", "Gene"] <- "IFT70A"
All gene names have been linked to the right name in all_genes
Excel file coming from supplemental data.
Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx",
sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"
Wang_CT <- ensembl_gene_names %>%
filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
right_join(Wang_CT)
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"
length(Wang_CT$external_gene_name[!Wang_CT$external_gene_name %in% all_genes$external_gene_name])
## [1] 12
12 genes are not found in all_genes, by gene names or ensemble gene id and no other name.
Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]
Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"
Carter_CT[Carter_CT$Gene == "ENSG00000221843", "Gene_Name"] <- "SPATA31H1"
Carter_CT[Carter_CT$Gene == "ENSG00000177947", "Gene_Name"] <- "CIMAP1A"
Carter_CT[Carter_CT$Gene == "ENSG00000172073", "Gene_Name"] <- "SPMIP9"
Carter_CT[Carter_CT$Gene == "ENSG00000229894", "Gene_Name"] <- "GK3"
Carter_CT[Carter_CT$Gene == "ENSG00000249693", "Gene_Name"] <- "SPMAP2L"
Carter_CT[Carter_CT$Gene == "ENSG00000173728", "Gene_Name"] <- "SPMIP3"
length(Carter_CT$Gene[!Carter_CT$Gene %in% all_genes$ensembl_gene_id])
## [1] 1
1 gene cannot be found in all_genes even by ensemble or gene name
Excel file from supplemental data.
Bruggeman_data <- read_excel("data/Bruggeman_suppl_data.xlsx", skip = 1,
sheet = "1D")
Bruggeman_official_names <- gene_synonym %>%
dplyr::select(external_gene_name) %>%
unique() %>%
filter(external_gene_name %in% Bruggeman_data$Gene) %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA)
Bruggeman_synonym <- gene_synonym %>%
filter(external_synonym %in% Bruggeman_data$Gene) %>%
mutate(Gene_Name = external_synonym) %>%
dplyr::select(external_gene_name, Gene_Name, external_synonym)
Bruggeman_synonym <- Bruggeman_synonym[-which(Bruggeman_synonym$Gene_Name %in%
Bruggeman_official_names$Gene_Name),]
Bruggeman_CT <- rbind(Bruggeman_official_names, Bruggeman_synonym)
lost <- Bruggeman_data[which(!Bruggeman_data$Gene %in%
c(Bruggeman_CT$Gene_Name)), "Gene"]
colnames(lost) <- "external_gene_name"
lost$Gene_Name <- rep(NA, nrow(lost))
lost$external_synonym <- rep(NA, nrow(lost))
lost[lost$external_gene_name == "C21orf59", "Gene_Name"] <- "CFAP298"
lost[lost$external_gene_name == "C11orf57", "Gene_Name"] <- "NKAPD1"
lost[lost$external_gene_name == "C7orf55", "Gene_Name"] <- "FMC1"
lost[lost$external_gene_name == "C10orf12", "Gene_Name"] <- "LCOR"
lost[lost$external_gene_name == "RPL19P12", "Gene_Name"] <- "RPL19P12"
lost[lost$external_gene_name == "C16orf59", "Gene_Name"] <- "TEDC2"
lost[lost$external_gene_name == "TTTY15", "Gene_Name"] <- "USP9Y"
lost[lost$external_gene_name == "C17orf53", "Gene_Name"] <- "HROB"
lost[lost$external_gene_name == "C1orf112", "Gene_Name"] <- "FIRRM"
lost[lost$external_gene_name == "C12orf66", "Gene_Name"] <- "KICS2"
lost[lost$external_gene_name == "C9orf84", "Gene_Name"] <- "SHOC1"
lost[lost$external_gene_name == "C10orf25", "Gene_Name"] <- "ZNF22-AS1"
lost[lost$external_gene_name == "C20orf197", "Gene_Name"] <- "LINC02910"
lost[lost$external_gene_name == "C3orf67", "Gene_Name"] <- "CFAP20DC"
lost[lost$external_gene_name == "C8orf37", "Gene_Name"] <- "CFAP418"
lost[lost$external_gene_name == "C22orf34", "Gene_Name"] <- "MIR3667HG"
lost[lost$external_gene_name == "C9orf131", "Gene_Name"] <- "SPATA31G1"
Bruggeman_CT <- rbind(Bruggeman_CT, lost)
missing_Bruggeman <- Bruggeman_CT[!Bruggeman_CT$Gene_Name %in%
all_genes$external_gene_name, ]$Gene_Name
external_names_to_keep <- gene_synonym %>%
filter(external_synonym %in% missing_Bruggeman) %>%
filter(external_gene_name %in% all_genes$external_gene_name) %>%
mutate(Gene_Name = external_gene_name)
Bruggeman_CT[Bruggeman_CT$external_synonym %in%
external_names_to_keep$external_synonym,
"Gene_Name"] <- external_names_to_keep$Gene_Name
Bruggeman_CT <- Bruggeman_CT %>%
dplyr::select(Gene_Name)
length(Bruggeman_CT$Gene_Name[!Bruggeman_CT$Gene_Name %in%
all_genes$external_gene_name])
## [1] 38
38 genes are not found with their names in all genes
To characterise the differences between our database and other, we
need the category we created in CTexploreR. For this, we have the object
all_genes in CTdata that contains the CT analysis for all
genes. More info in
Hereunder is what we used for our selection pipeline (coming from
make_all_genes_prelim.R and
130_make_all_genes_and_CT_genes.R in
CTdata).
all_genes
From there, we filtered based on the testis_specificity (“testis_specific”, which is based on expression in health tissue and scRNA seq info from HPA), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then, when wanting to validate TSS manually, we realised that for some genes, reads were not properly aligned to exons which might reflect a poorly defined transcription in these regions and are hence likely unreliable.
Some genes were also characterized as Cancer-Testis preferential genes when testis specificity was less stringent
CTdatabase_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
CTdatabase = CTdatabase_cleaned$external_gene_name))
gp <- VennThemes(compute.Venn(CTdatabase_ours))
gp[["Face"]][["11"]]$fill <- "mediumaquamarine"
gp[["Face"]][["10"]]$fill <- "darkseagreen1"
gp[["Face"]][["01"]]$fill <- "lightsteelblue1"
gp[["Set"]][["Set2"]]$col <- "paleturquoise4"
gp[["Set"]][["Set1"]]$col <- "darkseagreen4"
gp[["SetText"]][["Set2"]]$col <- "paleturquoise4"
gp[["SetText"]][["Set1"]]$col <- "darkseagreen4"
plot(CTdatabase_ours, gp = gp)
We find 29.2682927 % of CTdatabase in CTexploreR, which is 40.4494382% of our database.
Lost genes analysis
CTdatabase_lost <- all_genes %>%
filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["01"]])
# 9 Genes are lost because not in any database like before
table(CTdatabase_lost$testis_specificity)
##
## not_testis_specific testis_preferential testis_specific
## 101 29 35
table(CTdatabase_lost$CT_gene_type)
##
## CTP_gene other
## 24 141
table(CTdatabase_lost$not_detected_in_somatic_HPA)
##
## FALSE TRUE
## 51 79
table(CTdatabase_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 83 29 25 28
table(CTdatabase_lost$CCLE_category)
##
## activated leaky not_activated
## 88 26 51
table(CTdatabase_lost$TCGA_category, CTdatabase_lost$CCLE_category)
##
## activated leaky not_activated
## activated 63 1 19
## leaky 4 25 0
## multimapping_issue 15 0 10
## not_activated 6 0 22
CTdatabase_lost_upset <-
list(`Not testis specific` =
filter(CTdatabase_lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(CTdatabase_lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(CTdatabase_lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_CTdatabase <- fromList(CTdatabase_lost_upset)
upset(upset_CTdatabase,
text.scale = upset_text_size)
78.7878788 % of these genes are not testis specific.
However 24 of these lost genes are flagged as Cancer-Testis preferential in our analysis.
61.8181818 % are not properly activated in tumors and/or cancer cell lines.
In their analysis, they had characterised gene specificity, some being not available, not found in testis, testis-restricted, testis-selective and testis/brain-restricted. Let’s see how the lost genes qualify as they didn’t mention those were strictly testis specific.
CTdatabase_cleaned %>%
filter(external_gene_name %in% CTdatabase_lost$external_gene_name) %>%
pull(Classification) %>%
table()
## .
## not available not found in testis testis-restricted
## 90 2 15
## testis-selective testis/brain-restricted
## 51 7
We can see that most of them had no info or were testis-selective (I couldn’t find on website or paper how they selected categories).
core_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
Jamin = Jamin_core_CT$Gene))
Wang_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
Wang = Wang_CT$external_gene_name))
Carter_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
Carter_CT = Carter_CT$Gene_Name))
Bruggeman_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
Bruggeman = Bruggeman_CT$Gene_Name))
gene_list <- list(CTexploreR = CT_genes$external_gene_name,
Carter = Carter_CT$Gene_Name,
Jamin = Jamin_core_CT$Gene,
CTatlas = Wang_CT$external_gene_name,
Bruggeman = Bruggeman_CT$Gene_Name)
upset_omics <- fromList(gene_list)
upset(upset_omics)
4 in all, 60 in at least 3 databases
Lost genes analysis
plot(core_ours, gp = gp)
Jamin_lost <- all_genes %>%
filter(external_gene_name %in% core_ours@IntersectionSets[["01"]])
table(Jamin_lost$testis_specificity)
##
## not_testis_specific testis_preferential testis_specific
## 81 13 2
table(Jamin_lost$CT_gene_type)
##
## CTP_gene other
## 11 85
table(Jamin_lost$not_detected_in_somatic_HPA)
##
## FALSE TRUE
## 45 34
table(Jamin_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 32 42 17 5
table(Jamin_lost$CCLE_category)
##
## activated leaky not_activated
## 57 35 4
table(Jamin_lost$TCGA_category, Jamin_lost$CCLE_category)
##
## activated leaky not_activated
## activated 28 2 2
## leaky 9 33 0
## multimapping_issue 17 0 0
## not_activated 3 0 2
Jamin_lost_upset <-
list(`Not testis specific` =
filter(Jamin_lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(Jamin_lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(Jamin_lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_Jamin <- fromList(Jamin_lost_upset)
upset(upset_Jamin,
text.scale = upset_text_size)
We find 23.2 % of CTdatabase in CTexploreR, which is 16.2921348 % of our database.
97.9166667% of these genes are not testis specific.
However 11 of these lost genes are flagged as Cancer-Testis preferential in our analysis.
70.8333333 % are not properly activated in tumors and/or cancer cell lines.
plot(Wang_ours, gp = gp)
Wang_lost <- all_genes %>%
filter(external_gene_name %in% Wang_ours@IntersectionSets[["01"]])
# 12 genes lost because no info like before
table(Wang_lost$testis_specificity)
##
## not_testis_specific testis_preferential testis_specific
## 574 141 200
table(Wang_lost$CT_gene_type)
##
## CTP_gene other
## 72 843
table(Wang_lost$not_detected_in_somatic_HPA)
##
## FALSE TRUE
## 254 588
table(Wang_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 473 163 66 213
table(Wang_lost$CCLE_category)
##
## activated leaky not_activated
## 407 121 387
table(Wang_lost$TCGA_category, Wang_lost$CCLE_category)
##
## activated leaky not_activated
## activated 286 7 180
## leaky 42 112 9
## multimapping_issue 34 0 32
## not_activated 45 2 166
Wang_lost_upset <-
list(`Not testis specific` =
filter(Wang_lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(Wang_lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(Wang_lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_Wang <- fromList(Wang_lost_upset)
upset(upset_Wang,
text.scale = upset_text_size)
We find 9.0284593 % of CTdatabase in CTexploreR, which is 51.6853933 % of our database.
78.1420765% of these genes are not testis specific.
However 72 of these lost genes are flagged as Cancer-Testis preferential in our analysis.
68.7431694 % are not properly activated in tumors and/or cancer cell lines.
plot(Carter_ours, gp = gp)
Carter_lost <- all_genes %>%
filter(external_gene_name %in% Carter_ours@IntersectionSets[["01"]])
# 1 lost because no info, like before
table(Carter_lost$testis_specificity)
##
## not_testis_specific testis_preferential testis_specific
## 22 9 30
table(Carter_lost$CT_gene_type)
##
## CTP_gene other
## 5 56
table(Carter_lost$not_detected_in_somatic_HPA)
##
## FALSE TRUE
## 13 42
table(Carter_lost$TCGA_category)
##
## activated leaky not_activated
## 41 1 19
table(Carter_lost$CCLE_category)
##
## activated not_activated
## 21 40
table(Carter_lost$TCGA_category, Carter_lost$CCLE_category)
##
## activated not_activated
## activated 17 24
## leaky 0 1
## not_activated 4 15
Carter_lost_upset <-
list(`Not testis specific` =
filter(Carter_lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(Carter_lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(Carter_lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_Carter <- fromList(Carter_lost_upset)
upset(upset_Carter,
text.scale = upset_text_size)
We find 39.8058252 % of CTdatabase in CTexploreR, which is 23.0337079 % of our database.
50.8196721% of these genes are not testis specific.
However 5 of these lost genes are flagged as Cancer-Testis preferential in our analysis.
72.1311475 % are not properly activated in tumors and/or cancer cell lines.
plot(Bruggeman_ours, gp = gp)
Bruggeman_lost <- all_genes %>%
filter(external_gene_name %in% Bruggeman_ours@IntersectionSets[["01"]])
# 38 lost, like before
table(Bruggeman_lost$testis_specificity)
##
## not_testis_specific testis_preferential testis_specific
## 670 26 9
table(Bruggeman_lost$CT_gene_type)
##
## CTP_gene other
## 19 686
table(Bruggeman_lost$not_detected_in_somatic_HPA)
##
## FALSE TRUE
## 441 169
table(Bruggeman_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 274 382 11 38
table(Bruggeman_lost$CCLE_category)
##
## activated leaky not_activated
## 295 361 49
table(Bruggeman_lost$TCGA_category, Bruggeman_lost$CCLE_category)
##
## activated leaky not_activated
## activated 223 19 32
## leaky 50 324 8
## multimapping_issue 9 0 2
## not_activated 13 18 7
Bruggeman_lost_upset <-
list(`Not testis specific` =
filter(Bruggeman_lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(Bruggeman_lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(Bruggeman_lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_Bruggeman <- fromList(Bruggeman_lost_upset)
upset(upset_Bruggeman,
text.scale = upset_text_size)
We find 1.7195767 % of CTdatabase in CTexploreR, which is 7.3033708 % of our database.
98.7234043% of these genes are not testis specific.
However 19 of these lost genes are flagged as Cancer-Testis preferential in our analysis.
68.3687943 % are not properly activated in tumors and/or cancer cell lines.
all_database_gene_list <- list(CTexploreR = CT_genes$external_gene_name,
Carter = Carter_CT$Gene_Name,
Jamin = Jamin_core_CT$Gene,
CTatlas = Wang_CT$external_gene_name,
Bruggeman = Bruggeman_CT$Gene_Name,
CTdatabase = CTdatabase_cleaned$external_gene_name)
upset_all_database <- fromList(all_database_gene_list)
upset(upset_all_database,
nsets = 6,
nintersects = 50,
text.scale = upset_text_size)
Venn_all_database <- Venn(all_database_gene_list)
Venn_all_database@IntersectionSets[["111111"]]
## [1] "MAGEB2" "MAGEA4" "MAGEA3"
common <- unique(c(core_ours@IntersectionSets[["11"]],
CTdatabase_ours@IntersectionSets[["11"]],
Wang_ours@IntersectionSets[["11"]],
Carter_ours@IntersectionSets[["11"]],
Bruggeman_ours@IntersectionSets[["11"]]))
length(common)
## [1] 119
length(common)/dim(CT_genes)[1] * 100
## [1] 66.85393
lost_list <- unique(c(core_ours@IntersectionSets[["01"]],
CTdatabase_ours@IntersectionSets[["01"]],
Wang_ours@IntersectionSets[["01"]],
Carter_ours@IntersectionSets[["01"]],
Bruggeman_ours@IntersectionSets[["01"]]))
lost <- all_genes %>%
filter(external_gene_name %in% lost_list)
all_lost_upset <-
list(`Not testis specific` =
filter(lost,
testis_specificity != "testis_specific" &
CT_gene_type == "other")$external_gene_name,
`Not tumour activated` =
filter(lost,
(TCGA_category != "activated" &
TCGA_category != "multimapping_issue")|
CCLE_category != "activated")$external_gene_name,
`CT preferential` =
filter(lost,
CT_gene_type == "CTP_gene")$external_gene_name)
upset_all <- fromList(all_lost_upset)
upset(upset_all,
text.scale = upset_text_size)
not_specific <- filter(lost, testis_specificity == "not_testis_specific")
GTEX_expression(not_specific$external_gene_name, units = "log_TPM")
## Warning: 21 out of 1249 names invalid: FIRRM, CIMIP2C, IFT70B, IFT70A, BLTP1,
## AFG2A, CFAP96, SPMIP10, SCAND3, MTCL3, SPMIP4, ZNG1A, CDKN2A-AS1,
## ZNG1F, CIMIP2A, SPMIP5, SAXO4, CIMAP1C, LIAT1, BLTP2, CIMAP1D.
## See the manual page for valid types.
somatic_testis <- filter(lost, not_detected_in_somatic_HPA == FALSE)
testis_expression(somatic_testis$external_gene_name, cells = "all")
## Warning: 11 out of 711 names invalid: MPL, EDAR, ASB14, F2RL2, PCDHGB2, FOXO3B,
## KRT33B, SEMG1, ADORA2A, DAZ2, DAZ4.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
HPA_cell_type_expression(somatic_testis$external_gene_name)
not_TCGA_activated <- filter(lost, TCGA_category != "activated" &
TCGA_category != "multimapping_issue")
TCGA_expression(not_TCGA_activated$external_gene_name,
tumor = "all",
units = "log_TPM")
## Warning: 17 out of 809 names invalid: FIRRM, IFT70B, IFT70A, SPMAP2L, BLTP1,
## AFG2A, CFAP96, MTCL3, SPMIP4, SPMIP7, ZNG1A, SPATA31F1, ZNG1F, CIMAP1A,
## SAXO4, LIAT1, BLTP2.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
not_CCLE_activated <- filter(lost, CCLE_category != "activated")
CCLE_expression(not_CCLE_activated$external_gene_name,
type = c("lung", "skin", "colorectal",
"gastric", "breast", "head_and_neck"),
units = "log_TPM")
## Warning: 23 out of 932 names invalid: FIRRM, SPATA31H1, IFT70B, IFT70A, SPMAP2L,
## BLTP1, AFG2A, GK3, SPMIP10, SPMIP4, ZNG1A, CDKN2A-AS1, SPATA31F1,
## SPATA31G1, ZNG1F, CIMIP2A, CIMAP1A, REDIC1, CIMAP1C, LIAT1, BLTP2,
## CHCT1, CIMAP1D.
## See the manual page for valid types.
transcript_prob <- lost %>%
filter(testis_specificity == "testis_specific" |
testis_specificity == "testis_preferential") %>%
filter(TCGA_category == "activated" | TCGA_category == "multimapping_issue") %>%
filter(CCLE_category == "activated") %>%
dim()
119 genes in our CTexploreR database are found in at least one of the other database, which represents 66.8539326%.
The 3 genes found in all databases are MAGEB2, MAGEA4, MAGEA3.
We have lost 1691 genes in total. Among them, 73.8616203% are not considered testis specific, 42.0461266% are expressed in somatic cells, 47.8415139% are not activated in TCGA samples, 55.1153164% are not activated in CCLE cell lines and 5.2631579% is lost due to transcripts problems.
What about new genes in CTexploreR
new <- CT_genes %>%
filter(external_gene_name %in% Venn_all_database@IntersectionSets[["100000"]])
new
table(new$testis_specificity)
##
## testis_specific
## 59
table(new$X_linked)
##
## FALSE TRUE
## 48 11
table(new$regulated_by_methylation)
##
## FALSE TRUE
## 24 34
table(new$X_linked, new$regulated_by_methylation)
##
## FALSE TRUE
## FALSE 24 23
## TRUE 0 11
TCGA_expression(tumor = "all", genes = new$external_gene_name,
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
TCGA_expression(tumor = "all",
genes = filter(new, X_linked & regulated_by_methylation)$external_gene_name,
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
There are 59 new CT genes in CTexploreR. These are all testis specific and mainly on autosomes. Regulation by methylation is the majority of them. There is only 11 new “major” CT that are on the X chromosome and regulated by methylation. CT45 are not that new.
Expression in tumours doesn’t strike that much.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] SingleCellExperiment_1.26.0 UpSetR_1.4.0
## [3] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [5] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [7] IRanges_2.38.0 S4Vectors_0.42.0
## [9] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
## [11] matrixStats_1.3.0 lubridate_1.9.3
## [13] forcats_1.0.0 stringr_1.5.1
## [15] dplyr_1.1.4 purrr_1.0.2
## [17] tidyr_1.3.1 tibble_3.2.1
## [19] ggplot2_3.5.1 tidyverse_2.0.0
## [21] biomaRt_2.60.0 Vennerable_3.1.0.9000
## [23] CTexploreR_1.0.0 CTdata_1.4.0
## [25] readr_2.1.5 readxl_1.4.3
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.2 RBGL_1.80.0 gridExtra_2.3
## [4] httr2_1.0.1 rlang_1.1.3 magrittr_2.0.3
## [7] clue_0.3-65 GetoptLong_1.0.5 compiler_4.4.0
## [10] RSQLite_2.3.6 reshape2_1.4.4 png_0.1-8
## [13] vctrs_0.6.5 pkgconfig_2.0.3 shape_1.4.6.1
## [16] crayon_1.5.2 fastmap_1.2.0 magick_2.8.3
## [19] dbplyr_2.5.0 XVector_0.44.0 labeling_0.4.3
## [22] utf8_1.2.4 rmarkdown_2.27 tzdb_0.4.0
## [25] graph_1.82.0 UCSC.utils_1.0.0 bit_4.0.5
## [28] xfun_0.44 zlibbioc_1.50.0 cachem_1.1.0
## [31] jsonlite_1.8.8 progress_1.2.3 blob_1.2.4
## [34] highr_0.10 DelayedArray_0.30.1 prettyunits_1.2.0
## [37] parallel_4.4.0 cluster_2.1.6 R6_2.5.1
## [40] stringi_1.8.4 bslib_0.7.0 RColorBrewer_1.1-3
## [43] jquerylib_0.1.4 cellranger_1.1.0 Rcpp_1.0.12
## [46] iterators_1.0.14 knitr_1.46 timechange_0.3.0
## [49] Matrix_1.7-0 tidyselect_1.2.1 rstudioapi_0.16.0
## [52] abind_1.4-5 yaml_2.3.8 doParallel_1.0.17
## [55] codetools_0.2-19 curl_5.2.1 plyr_1.8.9
## [58] lattice_0.22-6 withr_3.0.0 KEGGREST_1.44.0
## [61] evaluate_0.23 BiocFileCache_2.12.0 xml2_1.3.6
## [64] circlize_0.4.16 ExperimentHub_2.12.0 Biostrings_2.72.0
## [67] pillar_1.9.0 BiocManager_1.30.23 filelock_1.0.3
## [70] foreach_1.5.2 generics_0.1.3 vroom_1.6.5
## [73] BiocVersion_3.19.1 hms_1.1.3 munsell_0.5.1
## [76] scales_1.3.0 glue_1.7.0 tools_4.4.0
## [79] AnnotationHub_3.12.0 Cairo_1.6-2 grid_4.4.0
## [82] AnnotationDbi_1.66.0 colorspace_2.1-0 GenomeInfoDbData_1.2.12
## [85] cli_3.6.2 rappdirs_0.3.3 fansi_1.0.6
## [88] S4Arrays_1.4.1 ComplexHeatmap_2.20.0 gtable_0.3.5
## [91] sass_0.4.9 digest_0.6.35 SparseArray_1.4.5
## [94] ggrepel_0.9.5 farver_2.1.2 rjson_0.2.21
## [97] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
## [100] httr_1.4.7 mime_0.12 GlobalOptions_0.1.2
## [103] bit64_4.0.5